train.csv - the training set
test.csv - the test set
sample_submission.csv - a sample submission file in the correct format
id - id of the creature
bone_length - average length of bone in the creature, normalized between 0 and 1
rotting_flesh - percentage of rotting flesh in the creature
hair_length - average hair length, normalized between 0 and 1
has_soul - percentage of soul in the creature
color - dominant color of the creature: ‘white’,‘black’,‘clear’,‘blue’,‘green’,‘blood’
type - target variable: ‘Ghost’, ‘Goblin’, and ‘Ghoul’
In 2010, Kaggle was founded as a platform for predictive modelling and analytics competitions on which companies and researchers post their data and statisticians and data miners from all over the world compete to produce the best models.
This crowdsourcing approach relies on the fact that there are countless strategies that can be applied to any predictive modelling task and it is impossible to know at the outset which technique or analyst will be most effective. Kaggle also hosts recruiting competitions in which data scientists compete for a chance to interview at leading data science companies like Facebook, Winton Capital, and Walmart.
## Library required ##
# For data manipulation and tidying
library(MASS)
library(tidyr)
library(dplyr)
# For data visualizations
library(ggplot2)
library(plotly)
# For modeling and predictions
library(caret)
library(glmnet)
library(ranger)
library(e1071)
library(clValid)
# Download databases
# setwd("~/Desktop/Kaggle/1. Halloween")
# thanks https://www.kaggle.com/amberthomas for many ideas in: https://www.kaggle.com/amberthomas/ghouls-goblins-and-ghosts-boo/ghosts-goblins-and-ghouls-oh-my
# Read the raw competition files; keep strings as characters so that
# factor conversion can be controlled explicitly later on.
train <- read.csv("train.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
test <- read.csv("test.csv", header = TRUE, stringsAsFactors = FALSE)

# Tag each row with its origin before stacking, so the combined data can
# be split back apart after feature engineering.
train$Dataset <- "train"
test$Dataset <- "test"
full <- bind_rows(train, test)

head(train)
## id bone_length rotting_flesh hair_length has_soul color type Dataset
## 1 0 0.3545122 0.3508390 0.4657609 0.7811417 clear Ghoul train
## 2 1 0.5755599 0.4258684 0.5314014 0.4398989 green Goblin train
## 3 2 0.4678755 0.3543304 0.8116161 0.7912250 black Ghoul train
## 4 4 0.7766525 0.5087225 0.6367656 0.8844637 black Ghoul train
## 5 5 0.5661166 0.8758618 0.4185937 0.6364378 green Ghost train
## 6 7 0.4056797 0.2532775 0.4414197 0.2803238 green Goblin train
# Five-number summaries of the numeric columns; the character columns
# (color, type, Dataset) have not been converted to factors yet.
summary(train)
## id bone_length rotting_flesh hair_length
## Min. : 0.0 Min. :0.06103 Min. :0.09569 Min. :0.1346
## 1st Qu.:205.5 1st Qu.:0.34001 1st Qu.:0.41481 1st Qu.:0.4074
## Median :458.0 Median :0.43489 Median :0.50155 Median :0.5386
## Mean :443.7 Mean :0.43416 Mean :0.50685 Mean :0.5291
## 3rd Qu.:678.5 3rd Qu.:0.51722 3rd Qu.:0.60398 3rd Qu.:0.6472
## Max. :897.0 Max. :0.81700 Max. :0.93247 Max. :1.0000
## has_soul color type
## Min. :0.009402 Length:371 Length:371
## 1st Qu.:0.348002 Class :character Class :character
## Median :0.466372 Mode :character Mode :character
## Mean :0.471392
## 3rd Qu.:0.600610
## Max. :0.935721
## Dataset
## Length:371
## Class :character
## Mode :character
##
##
##
# Convert the categorical columns to factors. id is made a factor too so
# it is not mistaken for a numeric predictor.
factor_V <- c("id", "color", "type")
train[factor_V] <- lapply(train[factor_V], as.factor)
str(train)
## 'data.frame': 371 obs. of 8 variables:
## $ id : Factor w/ 371 levels "0","1","2","4",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ bone_length : num 0.355 0.576 0.468 0.777 0.566 ...
## $ rotting_flesh: num 0.351 0.426 0.354 0.509 0.876 ...
## $ hair_length : num 0.466 0.531 0.812 0.637 0.419 ...
## $ has_soul : num 0.781 0.44 0.791 0.884 0.636 ...
## $ color : Factor w/ 6 levels "black","blood",..: 4 5 1 1 5 5 6 4 3 6 ...
## $ type : Factor w/ 3 levels "Ghost","Ghoul",..: 2 3 2 2 1 3 3 2 1 1 ...
## $ Dataset : chr "train" "train" "train" "train" ...
library(corrplot)

# Correlation matrix of the four numeric creature measurements.
numeric_features <- train %>% select(bone_length:has_soul)
train_correlation <- cor(numeric_features)
corrplot.mixed(train_correlation)

# Scatter-plot matrix of the same four features, coloured by type.
pairs(train[, 2:5],
      col = train$type,
      labels = c("Bone Length", "Rotting Flesh", "Hair Length", "Soul"))
# Distribution of three of the numeric features.
par(mfrow = c(1, 3))
hist(train$bone_length, col = "#3090C7", main = "bone_length")
hist(train$rotting_flesh, col = "#3090C7", main = "rotting_flesh")
hist(train$has_soul, col = "#3090C7", main = "has_soul")

# Counts of the two categorical variables.
par(mfrow = c(1, 2))
plot(train$color, col = "#3090C7", main = "Color")
plot(train$type, col = "#3090C7", main = "Type")
# Reset the plotting grid so later base-graphics plots are not drawn
# into a leftover 1x2 layout (the original never restored it).
par(mfrow = c(1, 1))

# Interactive 3-D scatter plot. plotly expects formula notation (~var)
# when a data frame is supplied; passing train$... vectors alongside
# `data` is deprecated in current plotly versions.
p <- plot_ly(train,
             x = ~bone_length, y = ~rotting_flesh, z = ~has_soul,
             type = "scatter3d", mode = "markers", color = ~type)
p

# Colour distribution per creature type.
ggplot(train, aes(color, fill = type)) + geom_bar()
Our feature don’t look easy to distinguisth… let’s try to create better features.
By multiplying our variables together we should obtain better features to distinguish the classes.
# Engineered separator #1: product of the three features that best
# separate the classes, rescaled to [0, 1] by its maximum.
full <- full %>%
  mutate(sep1 = bone_length * hair_length * has_soul,
         sep1 = sep1 / max(sep1))
ggplot(full, aes(id, sep1, color = type)) +
  geom_point()

# Separator #2: penalise sep1 by rotting_flesh, then rescale again.
full <- full %>%
  mutate(sep2 = sep1 / rotting_flesh,
         sep2 = sep2 / max(sep2))
ggplot(full, aes(id, sep2, color = type)) +
  geom_point()
## Warning: Removed 1 rows containing missing values (geom_point).
# Combine every measurement into one engineered ratio feature.
full <- full %>%
  mutate(allfeatures = ((bone_length^2) * (hair_length^4) * (has_soul^4)) / rotting_flesh)
# Plot the feature that was just created. (The original plotted sep1
# again here — a copy-paste slip, since sep1 was already shown above.)
ggplot(full, aes(id, allfeatures, color = type)) +
  geom_point()
# Pairwise interaction features between the four raw measurements.
# (Same columns, in the same order, as the original two mutate calls.)
full <- full %>%
  mutate(hair_soul  = hair_length * has_soul,
         bone_flesh = bone_length * rotting_flesh,
         bone_hair  = bone_length * hair_length,
         bone_soul  = bone_length * has_soul,
         flesh_hair = rotting_flesh * hair_length,
         flesh_soul = rotting_flesh * has_soul)
# Copy of the training data used for the cross-validated fits below.
train_cv <- train
# (Leftover template from another project, kept commented for reference.)
#Customer_cv$Long_term_value<-cut(Customer_cv$sum, c(0,100, 400, 40000))
#levels(Customer_cv$Long_term_value) <- c('low_value', 'medium_value', 'high_value')
#Customer_cv$Long_term_value <- as.factor(Customer_cv$Long_term_value)
#Customer_cv <- Customer_cv %>% select(age:Long_term_value)

# 8-fold cross-validation repeated 5 times. The original combined
# method = "cv" with repeats = 5; `repeats` is only valid for
# "repeatedcv", and current caret versions stop with an error for that
# combination, so the resampling method is corrected here.
train_control <- trainControl(method = "repeatedcv", number = 8, repeats = 5)
head(train_control)
## $method
## [1] "cv"
##
## $number
## [1] 8
##
## $repeats
## [1] 5
##
## $search
## [1] "grid"
##
## $p
## [1] 0.75
##
## $initialWindow
## NULL
library("rpart.plot")

# Single classification tree on the raw features, grown with an
# information-gain split criterion and a conservative minimum node size.
fit <- rpart(type ~ bone_length + rotting_flesh + hair_length + has_soul + color,
             data = train_cv,
             method = "class",
             control = rpart.control(minsplit = 50),
             parms = list(split = "information"))
rpart.plot(fit, type = 2, extra = 1)
library("rpart")
library("rpart.plot")

# Cross-validated CART. The original fitted a non-PCA tree and then
# immediately overwrote it with the PCA variant, silently discarding the
# first result; both fits are kept here under distinct names.
rpartmodel_raw <- train(type ~ bone_length + rotting_flesh + hair_length + has_soul + color,
                        data = train_cv, trControl = train_control, method = "rpart",
                        control = rpart.control(minsplit = 1),
                        parms = list(split = "information"))
# Same tree, but fitted on principal components of the numeric features
# (color dropped, since PCA needs numeric input).
rpartmodel <- train(type ~ bone_length + rotting_flesh + hair_length + has_soul,
                    data = train_cv, trControl = train_control, method = "rpart",
                    control = rpart.control(minsplit = 1), preProcess = "pca",
                    parms = list(split = "information"))

# Resubstitution predictions (on the training data itself — optimistic).
predictions <- predict(rpartmodel, train_cv)
train_cv_tree <- cbind(train_cv, predictions)
# Named so it does not shadow caret::confusionMatrix().
cm_rpart <- confusionMatrix(train_cv_tree$predictions, train_cv_tree$type)
cm_rpart
## Confusion Matrix and Statistics
##
## Reference
## Prediction Ghost Ghoul Goblin
## Ghost 92 0 6
## Ghoul 0 79 16
## Goblin 25 50 103
##
## Overall Statistics
##
## Accuracy : 0.7385
## 95% CI : (0.6907, 0.7825)
## No Information Rate : 0.3477
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6074
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Ghost Class: Ghoul Class: Goblin
## Sensitivity 0.7863 0.6124 0.8240
## Specificity 0.9764 0.9339 0.6951
## Pos Pred Value 0.9388 0.8316 0.5787
## Neg Pred Value 0.9084 0.8188 0.8860
## Prevalence 0.3154 0.3477 0.3369
## Detection Rate 0.2480 0.2129 0.2776
## Detection Prevalence 0.2642 0.2561 0.4798
## Balanced Accuracy 0.8814 0.7731 0.7596
library(e1071)
library(rminer)

# Naive Bayes under the same cross-validation scheme as the tree above.
e1071model <- train(type ~ bone_length + rotting_flesh + hair_length + has_soul + color,
                    data = train_cv, trControl = train_control, method = "nb")

# Resubstitution predictions and confusion matrix.
predictions <- predict(e1071model, train_cv)
e1071modelbinded <- cbind(train_cv, predictions)
confusionMatrix <- confusionMatrix(e1071modelbinded$predictions, e1071modelbinded$type)
confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction Ghost Ghoul Goblin
## Ghost 91 0 7
## Ghoul 0 100 24
## Goblin 26 29 94
##
## Overall Statistics
##
## Accuracy : 0.7682
## 95% CI : (0.7219, 0.8102)
## No Information Rate : 0.3477
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.6515
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: Ghost Class: Ghoul Class: Goblin
## Sensitivity 0.7778 0.7752 0.7520
## Specificity 0.9724 0.9008 0.7764
## Pos Pred Value 0.9286 0.8065 0.6309
## Neg Pred Value 0.9048 0.8826 0.8604
## Prevalence 0.3154 0.3477 0.3369
## Detection Rate 0.2453 0.2695 0.2534
## Detection Prevalence 0.2642 0.3342 0.4016
## Balanced Accuracy 0.8751 0.8380 0.7642
library(class)

# k-nearest neighbours; features are centred and scaled first, and k is
# tuned over 10 candidate values.
knnFit <- train(type ~ bone_length + rotting_flesh + hair_length + has_soul + color,
                data = train_cv, method = "knn", trControl = train_control,
                preProcess = c("center", "scale"), tuneLength = 10)

# Resubstitution predictions and confusion matrix.
predictions <- predict(knnFit, train_cv)
knnFit_bind <- cbind(train_cv, predictions)
confusionMatrix <- confusionMatrix(knnFit_bind$predictions, knnFit_bind$type)
confusionMatrix
## Confusion Matrix and Statistics
##
## Reference
## Prediction Ghost Ghoul Goblin
## Ghost 102 1 14
## Ghoul 2 100 23
## Goblin 13 28 88
##
## Overall Statistics
##
## Accuracy : 0.7817
## 95% CI : (0.7361, 0.8227)
## No Information Rate : 0.3477
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.6723
## Mcnemar's Test P-Value : 0.8349
##
## Statistics by Class:
##
## Class: Ghost Class: Ghoul Class: Goblin
## Sensitivity 0.8718 0.7752 0.7040
## Specificity 0.9409 0.8967 0.8333
## Pos Pred Value 0.8718 0.8000 0.6822
## Neg Pred Value 0.9409 0.8821 0.8471
## Prevalence 0.3154 0.3477 0.3369
## Detection Rate 0.2749 0.2695 0.2372
## Detection Prevalence 0.3154 0.3369 0.3477
## Balanced Accuracy 0.9064 0.8359 0.7687
# load the library
library(mlbench)
# load the dataset
# Model comparison uses the raw features only (columns bone_length..type).
comp.train <- train %>% select(bone_length:type)
#data(PimaIndiansDiabetes)
# prepare training scheme
# 10-fold CV repeated 3 times -> 30 resamples per model.
comp.control <- trainControl(method="repeatedcv", number=10, repeats=3)
# train the LVQ model (Learning Vector Quantization)
# Reseeding with the same seed before every fit gives all models
# identical CV folds, making the resamples() comparison below paired.
set.seed(7)
modelLvq <- train(type~., data=comp.train, method="lvq", trControl=comp.control)
# train the SVM model
set.seed(7)
modelSvm <- train(type~., data=comp.train, method="svmRadial", trControl=comp.control)
# train tree
set.seed(7)
modeltree <- train(type~., data=comp.train, method="rpart", trControl=comp.control)
# Tree + PCA
set.seed(7)
modeltreepca <- train(type~., data=comp.train, method="rpart", trControl=comp.control, preProcess = "pca", parms = list(split='information'))
# KNN
set.seed(7)
modelknn <- train(type~., data=comp.train, method="knn", trControl=comp.control)
# Bayes
set.seed(7)
modelbayes <- train(type~., data=comp.train, method="nb", trControl=comp.control)
# The warning below came from the knitted run: naive Bayes assigned zero
# density to one observation; harmless for the comparison.
## Warning in FUN(X[[i]], ...): Numerical 0 probability for all classes with
## observation 33
# collect resamples
results <- resamples(list(LVQ=modelLvq, SVM=modelSvm, TREE=modeltree, TREEPCA=modeltreepca, KNN=modelknn, NBayes=modelbayes))
# summarize the distributions
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: LVQ, SVM, TREE, TREEPCA, KNN, NBayes
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LVQ 0.3421 0.5329 0.5946 0.5843 0.6486 0.7368 0
## SVM 0.5135 0.6778 0.7297 0.7171 0.7568 0.8889 0
## TREE 0.5405 0.5833 0.6216 0.6286 0.6757 0.7632 0
## TREEPCA 0.5135 0.6038 0.6623 0.6567 0.6842 0.7838 0
## KNN 0.5833 0.6623 0.6842 0.6934 0.7133 0.8649 0
## NBayes 0.5946 0.6757 0.7027 0.7121 0.7566 0.8378 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## LVQ 0.01247 0.2977 0.3921 0.3749 0.4717 0.6062 0
## SVM 0.26650 0.5172 0.5941 0.5747 0.6336 0.8333 0
## TREE 0.30880 0.3766 0.4342 0.4441 0.5145 0.6463 0
## TREEPCA 0.26570 0.4087 0.4954 0.4856 0.5284 0.6754 0
## KNN 0.36920 0.4925 0.5265 0.5392 0.5698 0.7976 0
## NBayes 0.39080 0.5140 0.5523 0.5670 0.6340 0.7571 0
# boxplots of results
# Box-and-whisker comparison of accuracy/kappa across the six models.
bwplot(results)
# dot plots of results
# dotplot(results)
We use multiple models (of the same kind) to aggregate and predict:
. Bagged CART
. Random Forest
# Example of Bagging algorithms
# Same repeated-CV scheme as before; reseeding before each fit gives the
# two bagged models identical folds for a paired comparison.
control <- trainControl(method="repeatedcv", number=10, repeats=3)
seed <- 7
metric <- "Accuracy"
# Bagged CART
set.seed(seed)
fit.treebag <- train(type~., data=comp.train, method="treebag", metric=metric, trControl=control)
# Random Forest
set.seed(seed)
fit.rf <- train(type~., data=comp.train, method="rf", metric=metric, trControl=control)
# summarize results
bagging_results <- resamples(list(treebag=fit.treebag, rf=fit.rf))
summary(bagging_results)
##
## Call:
## summary.resamples(object = bagging_results)
##
## Models: treebag, rf
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## treebag 0.5263 0.6757 0.7027 0.7106 0.7568 0.8158 0
## rf 0.6216 0.6689 0.7297 0.7207 0.7568 0.8378 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## treebag 0.2890 0.5132 0.5523 0.5658 0.6342 0.7241 0
## rf 0.4283 0.5007 0.5941 0.5806 0.6349 0.7568 0
# Dot plot of the two bagged models' resampling distributions.
dotplot(bagging_results)
Boosting is like bagging, but each new model focuses on the mistakes made by the preceding models.
. C5.0
. Stochastic Gradient Boosting
library(mlbench)
library(caret)
library(caretEnsemble)
# Example of Boosting Algorithms
control <- trainControl(method="repeatedcv", number=10, repeats=3)
seed <- 7
metric <- "Accuracy"
# C5.0
set.seed(seed)
## Error in loadNamespace(i, c(lib.loc, .libPaths()), versionCheck = vI[[i]]) : there is no package called ‘partykit’
# NOTE(review): the logged error above means the partykit dependency was
# missing when this report was knitted; install it before rerunning C5.0.
fit.c50 <- train(type~., data=comp.train, method="C5.0", metric=metric, trControl=control)
# Stochastic Gradient Boosting
set.seed(seed)
fit.gbm <- train(type~., data=comp.train, method="gbm", metric=metric, trControl=control, verbose=FALSE)
# summarize results
boosting_results <- resamples(list(c5.0=fit.c50, gbm=fit.gbm))
summary(boosting_results)
##
## Call:
## summary.resamples(object = boosting_results)
##
## Models: c5.0, gbm
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## c5.0 0.5789 0.6601 0.7162 0.7122 0.7568 0.8378 0
## gbm 0.6486 0.7047 0.7434 0.7393 0.7568 0.8611 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## c5.0 0.3692 0.4911 0.5741 0.5681 0.6344 0.7566 0
## gbm 0.4703 0.5570 0.6134 0.6085 0.6356 0.7919 0
# Dot plot of the two boosting models' resampling distributions.
dotplot(boosting_results)
We use models of different types to aggregate and predict.
Example with: Linear Discriminant Analysis (LDA), Classification and Regression Trees (CART), k-Nearest Neighbors (kNN), and a Support Vector Machine with a Radial Basis Kernel Function (SVM).
# Example of Stacking algorithms
# create submodels
# savePredictions/classProbs are required if these fits are later fed to
# caretStack(); `seed` still holds 7 from the bagging section above.
control <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
algorithmList <- c('lda', 'rpart', 'knn', 'svmRadial')
set.seed(seed)
models <- caretList(type~., data=comp.train, trControl=control, methodList=algorithmList)
results <- resamples(models)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: lda, rpart, knn, svmRadial
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.5676 0.7027 0.7297 0.7303 0.7616 0.8611 0
## rpart 0.5405 0.5833 0.6216 0.6286 0.6757 0.7632 0
## knn 0.5946 0.6554 0.6842 0.6934 0.7193 0.8649 0
## svmRadial 0.5135 0.6842 0.7183 0.7170 0.7568 0.8889 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## lda 0.3495 0.5518 0.5936 0.5948 0.6419 0.7919 0
## rpart 0.3088 0.3766 0.4342 0.4441 0.5145 0.6463 0
## knn 0.3888 0.4808 0.5260 0.5392 0.5801 0.7976 0
## svmRadial 0.2665 0.5265 0.5777 0.5745 0.6346 0.8333 0
# Dot plot of the four sub-models' resampling results.
dotplot(results)
# correlation between results
# Low correlation between models suggests they make different errors,
# which is what makes them useful candidates for stacking.
modelCor(results)
## lda rpart knn svmRadial
## lda 1.0000000 0.3515136 0.3736171 0.7924729
## rpart 0.3515136 1.0000000 0.1744603 0.2765570
## knn 0.3736171 0.1744603 1.0000000 0.5322076
## svmRadial 0.7924729 0.2765570 0.5322076 1.0000000
# Scatter-plot matrix of the pairwise resample accuracies.
splom(results)
Let’s combine the predictions of the classifiers using a simple linear model.
# Thanks: http://machinelearningmastery.com/machine-learning-ensembles-with-r/ but not yet implemented for multiclass problems...
# stack using glm
# stackControl <- trainControl(method="repeatedcv", number=10, repeats=3, savePredictions=TRUE, classProbs=TRUE)
# set.seed(seed)
# stack.glm <- caretStack(models, method="glm", metric="Accuracy", trControl=stackControl)
# print(stack.glm)
# from https://www.kaggle.com/amberthomas/ghouls-goblins-and-ghosts-boo/ghosts-goblins-and-ghouls-oh-my
set.seed(10)

### k-means clustering without the categorical variables
# Set the seed
set.seed(100)

# Keep the class labels aside (NA for the test rows), then build a
# purely numeric table: the raw measurements plus the pairwise
# interaction features. Selecting these columns directly also drops
# type, id, color and Dataset (and skips sep1/sep2/allfeatures), which
# is exactly what the original achieved with four NULL assignments.
creature_labels <- full$type
full2 <- full %>% select(bone_length:has_soul, hair_soul:flesh_soul)

# 3 clusters (one per creature type), best of 30 random starts.
creature_km_1 <- kmeans(full2, 3, nstart = 30)
# Split the engineered dataset back into its original train/test parts.
train_complete <- full[full$Dataset == 'train', ]
test_complete <- full[full$Dataset == 'test', ]

# 10-fold cross-validation. The original passed repeats = 20 with
# method = "cv"; `repeats` is only meaningful for "repeatedcv" and
# current caret versions raise an error for that combination (the logged
# run below shows a single pass of 10 folds), so it is dropped here.
myControl <- trainControl(
  method = "cv",
  number = 10,
  verboseIter = TRUE
)

# Penalised multinomial regression (glmnet) over an elastic-net grid:
# alpha = 0 (ridge) or 1 (lasso), 20 lambda values in [1e-4, 1].
glm_model <- train(
  type ~ bone_length + rotting_flesh + hair_length + has_soul + color + hair_soul + bone_flesh + bone_hair +
    bone_soul + flesh_hair + flesh_soul,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0:1,
                         lambda = seq(0.0001, 1, length = 20)),
  data = train_complete,
  trControl = myControl
)
## + Fold01: alpha=0, lambda=1
## - Fold01: alpha=0, lambda=1
## + Fold01: alpha=1, lambda=1
## - Fold01: alpha=1, lambda=1
## + Fold02: alpha=0, lambda=1
## - Fold02: alpha=0, lambda=1
## + Fold02: alpha=1, lambda=1
## - Fold02: alpha=1, lambda=1
## + Fold03: alpha=0, lambda=1
## - Fold03: alpha=0, lambda=1
## + Fold03: alpha=1, lambda=1
## - Fold03: alpha=1, lambda=1
## + Fold04: alpha=0, lambda=1
## - Fold04: alpha=0, lambda=1
## + Fold04: alpha=1, lambda=1
## - Fold04: alpha=1, lambda=1
## + Fold05: alpha=0, lambda=1
## - Fold05: alpha=0, lambda=1
## + Fold05: alpha=1, lambda=1
## - Fold05: alpha=1, lambda=1
## + Fold06: alpha=0, lambda=1
## - Fold06: alpha=0, lambda=1
## + Fold06: alpha=1, lambda=1
## - Fold06: alpha=1, lambda=1
## + Fold07: alpha=0, lambda=1
## - Fold07: alpha=0, lambda=1
## + Fold07: alpha=1, lambda=1
## - Fold07: alpha=1, lambda=1
## + Fold08: alpha=0, lambda=1
## - Fold08: alpha=0, lambda=1
## + Fold08: alpha=1, lambda=1
## - Fold08: alpha=1, lambda=1
## + Fold09: alpha=0, lambda=1
## - Fold09: alpha=0, lambda=1
## + Fold09: alpha=1, lambda=1
## - Fold09: alpha=1, lambda=1
## + Fold10: alpha=0, lambda=1
## - Fold10: alpha=0, lambda=1
## + Fold10: alpha=1, lambda=1
## - Fold10: alpha=1, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.0527 on full training set
Plot
library(fpc)
# Project the clustered rows onto discriminant coordinates, coloured by
# cluster assignment.
plotcluster(full2, creature_km_1$cluster)
# Cross-tabulate cluster membership against the true labels (the test
# rows have NA labels and are dropped from the table).
table(creature_km_1$cluster, creature_labels)
## creature_labels
## Ghost Ghoul Goblin
## 1 7 39 75
## 2 4 86 24
## 3 106 4 26
Or:
# Same glmnet specification as glm_model above, refitted into `model`
# for the printed summary that follows.
model <- train(
  type ~ bone_length + rotting_flesh + hair_length + has_soul + color + hair_soul + bone_flesh + bone_hair +
    bone_soul + flesh_hair + flesh_soul,
  method = "glmnet",
  data = train_complete,
  tuneGrid = expand.grid(alpha = 0:1,
                         lambda = seq(0.0001, 1, length = 20)),
  trControl = myControl
)
## + Fold01: alpha=0, lambda=1
## - Fold01: alpha=0, lambda=1
## + Fold01: alpha=1, lambda=1
## - Fold01: alpha=1, lambda=1
## + Fold02: alpha=0, lambda=1
## - Fold02: alpha=0, lambda=1
## + Fold02: alpha=1, lambda=1
## - Fold02: alpha=1, lambda=1
## + Fold03: alpha=0, lambda=1
## - Fold03: alpha=0, lambda=1
## + Fold03: alpha=1, lambda=1
## - Fold03: alpha=1, lambda=1
## + Fold04: alpha=0, lambda=1
## - Fold04: alpha=0, lambda=1
## + Fold04: alpha=1, lambda=1
## - Fold04: alpha=1, lambda=1
## + Fold05: alpha=0, lambda=1
## - Fold05: alpha=0, lambda=1
## + Fold05: alpha=1, lambda=1
## - Fold05: alpha=1, lambda=1
## + Fold06: alpha=0, lambda=1
## - Fold06: alpha=0, lambda=1
## + Fold06: alpha=1, lambda=1
## - Fold06: alpha=1, lambda=1
## + Fold07: alpha=0, lambda=1
## - Fold07: alpha=0, lambda=1
## + Fold07: alpha=1, lambda=1
## - Fold07: alpha=1, lambda=1
## + Fold08: alpha=0, lambda=1
## - Fold08: alpha=0, lambda=1
## + Fold08: alpha=1, lambda=1
## - Fold08: alpha=1, lambda=1
## + Fold09: alpha=0, lambda=1
## - Fold09: alpha=0, lambda=1
## + Fold09: alpha=1, lambda=1
## - Fold09: alpha=1, lambda=1
## + Fold10: alpha=0, lambda=1
## - Fold10: alpha=0, lambda=1
## + Fold10: alpha=1, lambda=1
## - Fold10: alpha=1, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.158 on full training set
# # Print model to console
# Shows the resampling results across the whole alpha/lambda grid and
# the selected tuning parameters.
model
## glmnet
##
## 371 samples
## 11 predictor
## 3 classes: 'Ghost', 'Ghoul', 'Goblin'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 333, 333, 335, 334, 334, 335, ...
## Resampling results across tuning parameters:
##
## alpha lambda Accuracy Kappa
## 0 0.00010000 0.7350640 0.6023770
## 0 0.05272632 0.7404734 0.6104269
## 0 0.10535263 0.7489529 0.6232589
## 0 0.15797895 0.7517307 0.6273562
## 0 0.21060526 0.7489529 0.6229377
## 0 0.26323158 0.7488818 0.6228710
## 0 0.31585789 0.7408408 0.6109715
## 0 0.36848421 0.7355066 0.6029439
## 0 0.42111053 0.7381381 0.6069389
## 0 0.47373684 0.7381381 0.6069389
## 0 0.52636316 0.7382843 0.6072288
## 0 0.57898947 0.7382843 0.6072288
## 0 0.63161579 0.7410621 0.6113954
## 0 0.68424211 0.7410621 0.6114630
## 0 0.73686842 0.7410621 0.6114630
## 0 0.78949474 0.7410621 0.6115241
## 0 0.84212105 0.7410621 0.6115241
## 0 0.89474737 0.7410621 0.6115241
## 0 0.94737368 0.7410621 0.6115241
## 0 1.00000000 0.7410621 0.6115241
## 1 0.00010000 0.7300221 0.5945355
## 1 0.05272632 0.7494705 0.6242450
## 1 0.10535263 0.7273985 0.5914999
## 1 0.15797895 0.6867631 0.5310673
## 1 0.21060526 0.6513988 0.4776070
## 1 0.26323158 0.6494942 0.4723063
## 1 0.31585789 0.3478031 0.0000000
## 1 0.36848421 0.3478031 0.0000000
## 1 0.42111053 0.3478031 0.0000000
## 1 0.47373684 0.3478031 0.0000000
## 1 0.52636316 0.3478031 0.0000000
## 1 0.57898947 0.3478031 0.0000000
## 1 0.63161579 0.3478031 0.0000000
## 1 0.68424211 0.3478031 0.0000000
## 1 0.73686842 0.3478031 0.0000000
## 1 0.78949474 0.3478031 0.0000000
## 1 0.84212105 0.3478031 0.0000000
## 1 0.89474737 0.3478031 0.0000000
## 1 0.94737368 0.3478031 0.0000000
## 1 1.00000000 0.3478031 0.0000000
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were alpha = 0 and lambda = 0.1579789.
# MOST PROMISING MODEL:
# Paired comparison of the three strongest performers from the earlier
# sections (they were all fitted on identical CV folds via set.seed).
results <- resamples(list(GBM=fit.gbm, SVM=modelSvm, rf=fit.rf))
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: GBM, SVM, rf
## Number of resamples: 30
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GBM 0.6486 0.7047 0.7434 0.7393 0.7568 0.8611 0
## SVM 0.5135 0.6778 0.7297 0.7171 0.7568 0.8889 0
## rf 0.6216 0.6689 0.7297 0.7207 0.7568 0.8378 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## GBM 0.4703 0.5570 0.6134 0.6085 0.6356 0.7919 0
## SVM 0.2665 0.5172 0.5941 0.5747 0.6336 0.8333 0
## rf 0.4283 0.5007 0.5941 0.5806 0.6349 0.7568 0
# Dot plot of the three models' resampling distributions.
dotplot(results)
## + Fold01: mtry= 2
## - Fold01: mtry= 2
## + Fold01: mtry=10
## - Fold01: mtry=10
## + Fold01: mtry=18
## - Fold01: mtry=18
## + Fold02: mtry= 2
## - Fold02: mtry= 2
## + Fold02: mtry=10
## - Fold02: mtry=10
## + Fold02: mtry=18
## - Fold02: mtry=18
## + Fold03: mtry= 2
## - Fold03: mtry= 2
## + Fold03: mtry=10
## - Fold03: mtry=10
## + Fold03: mtry=18
## - Fold03: mtry=18
## + Fold04: mtry= 2
## - Fold04: mtry= 2
## + Fold04: mtry=10
## - Fold04: mtry=10
## + Fold04: mtry=18
## - Fold04: mtry=18
## + Fold05: mtry= 2
## - Fold05: mtry= 2
## + Fold05: mtry=10
## - Fold05: mtry=10
## + Fold05: mtry=18
## - Fold05: mtry=18
## + Fold06: mtry= 2
## - Fold06: mtry= 2
## + Fold06: mtry=10
## - Fold06: mtry=10
## + Fold06: mtry=18
## - Fold06: mtry=18
## + Fold07: mtry= 2
## - Fold07: mtry= 2
## + Fold07: mtry=10
## - Fold07: mtry=10
## + Fold07: mtry=18
## - Fold07: mtry=18
## + Fold08: mtry= 2
## - Fold08: mtry= 2
## + Fold08: mtry=10
## - Fold08: mtry=10
## + Fold08: mtry=18
## - Fold08: mtry=18
## + Fold09: mtry= 2
## - Fold09: mtry= 2
## + Fold09: mtry=10
## - Fold09: mtry=10
## + Fold09: mtry=18
## - Fold09: mtry=18
## + Fold10: mtry= 2
## - Fold10: mtry= 2
## + Fold10: mtry=10
## - Fold10: mtry=10
## + Fold10: mtry=18
## - Fold10: mtry=18
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 2 on full training set
#1. check the importance of each variables
# NOTE(review): rf_model is only created further down in this script —
# this chunk runs correctly only when evaluated after that fit.
vimp <- varImp(rf_model)
# Plotting "vimp"
# Bar chart of every predictor's importance score.
ggplot(vimp, top = dim(vimp$importance)[1])
# NOTE(review): train_fe is not defined anywhere in this file —
# presumably the feature-engineered training set (cf. train_complete);
# confirm before rerunning.
train_correlation <- train_fe %>% select(bone_length:has_soul,sep1:flesh_soul)
train_correlation <- cor(train_correlation)
# corrplot(train_correlation, method="circle")
# data
corrplot.mixed(train_correlation)
#cor(train_correlation)
# 2.TUNE (http://machinelearningmastery.com/tune-machine-learning-algorithms-in-r/)
set.seed(10)
# Random forest via ranger on the raw + interaction features (color
# excluded). tuneLength = 3 tries three mtry values; impurity importance
# is stored so varImp() works on the fit.
# NOTE(review): train_fe is not defined in this file — presumably the
# feature-engineered training set; confirm before rerunning.
rf_model <- train(
type ~ bone_length + rotting_flesh + hair_length + has_soul + hair_soul + bone_flesh + bone_hair +
bone_soul + flesh_hair + flesh_soul,
tuneLength = 3,
data = train_fe,
method = "ranger",
trControl = myControl,
importance = 'impurity'
)
## + Fold01: mtry= 2
## - Fold01: mtry= 2
## + Fold01: mtry= 6
## - Fold01: mtry= 6
## + Fold01: mtry=10
## - Fold01: mtry=10
## + Fold02: mtry= 2
## - Fold02: mtry= 2
## + Fold02: mtry= 6
## - Fold02: mtry= 6
## + Fold02: mtry=10
## - Fold02: mtry=10
## + Fold03: mtry= 2
## - Fold03: mtry= 2
## + Fold03: mtry= 6
## - Fold03: mtry= 6
## + Fold03: mtry=10
## - Fold03: mtry=10
## + Fold04: mtry= 2
## - Fold04: mtry= 2
## + Fold04: mtry= 6
## - Fold04: mtry= 6
## + Fold04: mtry=10
## - Fold04: mtry=10
## + Fold05: mtry= 2
## - Fold05: mtry= 2
## + Fold05: mtry= 6
## - Fold05: mtry= 6
## + Fold05: mtry=10
## - Fold05: mtry=10
## + Fold06: mtry= 2
## - Fold06: mtry= 2
## + Fold06: mtry= 6
## - Fold06: mtry= 6
## + Fold06: mtry=10
## - Fold06: mtry=10
## + Fold07: mtry= 2
## - Fold07: mtry= 2
## + Fold07: mtry= 6
## - Fold07: mtry= 6
## + Fold07: mtry=10
## - Fold07: mtry=10
## + Fold08: mtry= 2
## - Fold08: mtry= 2
## + Fold08: mtry= 6
## - Fold08: mtry= 6
## + Fold08: mtry=10
## - Fold08: mtry=10
## + Fold09: mtry= 2
## - Fold09: mtry= 2
## + Fold09: mtry= 6
## - Fold09: mtry= 6
## + Fold09: mtry=10
## - Fold09: mtry=10
## + Fold10: mtry= 2
## - Fold10: mtry= 2
## + Fold10: mtry= 6
## - Fold10: mtry= 6
## + Fold10: mtry=10
## - Fold10: mtry=10
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 6 on full training set
# Resampling summary of the ranger fit (best mtry = 6 per the log).
print(rf_model)
## Random Forest
##
## 371 samples
## 10 predictor
## 3 classes: 'Ghost', 'Ghoul', 'Goblin'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 333, 333, 333, 333, 334, 334, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7167259 0.5746530
## 6 0.7195037 0.5790649
## 10 0.7192903 0.5785770
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
# Accuracy as a function of mtry.
plot(rf_model)
# Second ranger variant: swaps hair_soul/bone_hair for the engineered
# allfeatures and sep2 columns and adds color back in.
# NOTE(review): train_fe is not defined in this file — confirm it is the
# feature-engineered training set before rerunning.
rf_model2 <- train(
type ~ bone_length + rotting_flesh + hair_length + has_soul + color + allfeatures + bone_flesh + sep2 +
bone_soul + flesh_hair + flesh_soul,
tuneLength = 3,
data = train_fe,
method = "ranger",
trControl = myControl,
importance = 'impurity'
)
## + Fold01: mtry= 2
## - Fold01: mtry= 2
## + Fold01: mtry= 8
## - Fold01: mtry= 8
## + Fold01: mtry=15
## - Fold01: mtry=15
## + Fold02: mtry= 2
## - Fold02: mtry= 2
## + Fold02: mtry= 8
## - Fold02: mtry= 8
## + Fold02: mtry=15
## - Fold02: mtry=15
## + Fold03: mtry= 2
## - Fold03: mtry= 2
## + Fold03: mtry= 8
## - Fold03: mtry= 8
## + Fold03: mtry=15
## - Fold03: mtry=15
## + Fold04: mtry= 2
## - Fold04: mtry= 2
## + Fold04: mtry= 8
## - Fold04: mtry= 8
## + Fold04: mtry=15
## - Fold04: mtry=15
## + Fold05: mtry= 2
## - Fold05: mtry= 2
## + Fold05: mtry= 8
## - Fold05: mtry= 8
## + Fold05: mtry=15
## - Fold05: mtry=15
## + Fold06: mtry= 2
## - Fold06: mtry= 2
## + Fold06: mtry= 8
## - Fold06: mtry= 8
## + Fold06: mtry=15
## - Fold06: mtry=15
## + Fold07: mtry= 2
## - Fold07: mtry= 2
## + Fold07: mtry= 8
## - Fold07: mtry= 8
## + Fold07: mtry=15
## - Fold07: mtry=15
## + Fold08: mtry= 2
## - Fold08: mtry= 2
## + Fold08: mtry= 8
## - Fold08: mtry= 8
## + Fold08: mtry=15
## - Fold08: mtry=15
## + Fold09: mtry= 2
## - Fold09: mtry= 2
## + Fold09: mtry= 8
## - Fold09: mtry= 8
## + Fold09: mtry=15
## - Fold09: mtry=15
## + Fold10: mtry= 2
## - Fold10: mtry= 2
## + Fold10: mtry= 8
## - Fold10: mtry= 8
## + Fold10: mtry=15
## - Fold10: mtry=15
## Aggregating results
## Selecting tuning parameters
## Fitting mtry = 8 on full training set
# The original printed and plotted rf_model again here by mistake — the
# transcript below repeats rf_model's mtry = 2/6/10 grid although
# rf_model2 was tuned over mtry = 2/8/15. Report the model just fitted.
print(rf_model2)
## Random Forest
##
## 371 samples
## 10 predictor
## 3 classes: 'Ghost', 'Ghoul', 'Goblin'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 333, 333, 333, 333, 334, 334, ...
## Resampling results across tuning parameters:
##
## mtry Accuracy Kappa
## 2 0.7167259 0.5746530
## 6 0.7195037 0.5790649
## 10 0.7192903 0.5785770
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was mtry = 6.
# Accuracy as a function of mtry for the second fit.
plot(rf_model2)
# 1. TUNE http://stackoverflow.com/questions/15613332/using-caret-package-to-find-optimal-parameters-of-gbm - http://stackoverflow.com/questions/15613332/using-caret-package-to-find-optimal-parameters-of-gbm - http://stats.stackexchange.com/questions/141719/change-settings-in-the-prediction-model-caret-package
set.seed(10)
# glmnet on raw + colour + interaction features over the elastic-net
# grid (alpha 0/1, 20 lambdas in [1e-4, 1]).
# NOTE(review): train_fe is not defined in this file — confirm it is the
# feature-engineered training set before rerunning.
glm_model <- train(
type ~ bone_length + rotting_flesh + hair_length + has_soul + color + hair_soul + bone_flesh + bone_hair +
bone_soul + flesh_hair + flesh_soul,
method = "glmnet",
tuneGrid = expand.grid(alpha = 0:1,
lambda = seq(0.0001, 1, length = 20)),
data = train_fe,
trControl = myControl
)
## + Fold01: alpha=0, lambda=1
## - Fold01: alpha=0, lambda=1
## + Fold01: alpha=1, lambda=1
## - Fold01: alpha=1, lambda=1
## + Fold02: alpha=0, lambda=1
## - Fold02: alpha=0, lambda=1
## + Fold02: alpha=1, lambda=1
## - Fold02: alpha=1, lambda=1
## + Fold03: alpha=0, lambda=1
## - Fold03: alpha=0, lambda=1
## + Fold03: alpha=1, lambda=1
## - Fold03: alpha=1, lambda=1
## + Fold04: alpha=0, lambda=1
## - Fold04: alpha=0, lambda=1
## + Fold04: alpha=1, lambda=1
## - Fold04: alpha=1, lambda=1
## + Fold05: alpha=0, lambda=1
## - Fold05: alpha=0, lambda=1
## + Fold05: alpha=1, lambda=1
## - Fold05: alpha=1, lambda=1
## + Fold06: alpha=0, lambda=1
## - Fold06: alpha=0, lambda=1
## + Fold06: alpha=1, lambda=1
## - Fold06: alpha=1, lambda=1
## + Fold07: alpha=0, lambda=1
## - Fold07: alpha=0, lambda=1
## + Fold07: alpha=1, lambda=1
## - Fold07: alpha=1, lambda=1
## + Fold08: alpha=0, lambda=1
## - Fold08: alpha=0, lambda=1
## + Fold08: alpha=1, lambda=1
## - Fold08: alpha=1, lambda=1
## + Fold09: alpha=0, lambda=1
## - Fold09: alpha=0, lambda=1
## + Fold09: alpha=1, lambda=1
## - Fold09: alpha=1, lambda=1
## + Fold10: alpha=0, lambda=1
## - Fold10: alpha=0, lambda=1
## + Fold10: alpha=1, lambda=1
## - Fold10: alpha=1, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 0, lambda = 0.316 on full training set
# Second glmnet variant: replaces bone_hair with the engineered sep2
# column; same elastic-net grid as above.
# NOTE(review): train_fe is not defined in this file — confirm before
# rerunning.
glm_model2 <- train(
type ~ bone_length + rotting_flesh + hair_length + has_soul + color + hair_soul + bone_flesh + sep2 +
bone_soul + flesh_hair + flesh_soul,
method = "glmnet",
tuneGrid = expand.grid(alpha = 0:1,
lambda = seq(0.0001, 1, length = 20)),
data = train_fe,
trControl = myControl
)
## + Fold01: alpha=0, lambda=1
## - Fold01: alpha=0, lambda=1
## + Fold01: alpha=1, lambda=1
## - Fold01: alpha=1, lambda=1
## + Fold02: alpha=0, lambda=1
## - Fold02: alpha=0, lambda=1
## + Fold02: alpha=1, lambda=1
## - Fold02: alpha=1, lambda=1
## + Fold03: alpha=0, lambda=1
## - Fold03: alpha=0, lambda=1
## + Fold03: alpha=1, lambda=1
## - Fold03: alpha=1, lambda=1
## + Fold04: alpha=0, lambda=1
## - Fold04: alpha=0, lambda=1
## + Fold04: alpha=1, lambda=1
## - Fold04: alpha=1, lambda=1
## + Fold05: alpha=0, lambda=1
## - Fold05: alpha=0, lambda=1
## + Fold05: alpha=1, lambda=1
## - Fold05: alpha=1, lambda=1
## + Fold06: alpha=0, lambda=1
## - Fold06: alpha=0, lambda=1
## + Fold06: alpha=1, lambda=1
## - Fold06: alpha=1, lambda=1
## + Fold07: alpha=0, lambda=1
## - Fold07: alpha=0, lambda=1
## + Fold07: alpha=1, lambda=1
## - Fold07: alpha=1, lambda=1
## + Fold08: alpha=0, lambda=1
## - Fold08: alpha=0, lambda=1
## + Fold08: alpha=1, lambda=1
## - Fold08: alpha=1, lambda=1
## + Fold09: alpha=0, lambda=1
## - Fold09: alpha=0, lambda=1
## + Fold09: alpha=1, lambda=1
## - Fold09: alpha=1, lambda=1
## + Fold10: alpha=0, lambda=1
## - Fold10: alpha=0, lambda=1
## + Fold10: alpha=1, lambda=1
## - Fold10: alpha=1, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 1e-04 on full training set
# Third elastic-net variant using the allfeatures engineered predictor.
# Fix: reset the seed before train() so the CV folds match glm_model's;
# resamples() below only compares models fairly when they were fitted on
# the same resampling partitions.
set.seed(10)
glm_model3 <- train(
  type ~ bone_length + rotting_flesh + hair_length + has_soul + color + allfeatures + bone_flesh + sep2 +
    bone_soul + flesh_hair + flesh_soul,
  method = "glmnet",
  tuneGrid = expand.grid(alpha = 0:1,
                         lambda = seq(0.0001, 1, length = 20)),
  data = train_fe,
  trControl = myControl
)
## + Fold01: alpha=0, lambda=1
## - Fold01: alpha=0, lambda=1
## + Fold01: alpha=1, lambda=1
## - Fold01: alpha=1, lambda=1
## + Fold02: alpha=0, lambda=1
## - Fold02: alpha=0, lambda=1
## + Fold02: alpha=1, lambda=1
## - Fold02: alpha=1, lambda=1
## + Fold03: alpha=0, lambda=1
## - Fold03: alpha=0, lambda=1
## + Fold03: alpha=1, lambda=1
## - Fold03: alpha=1, lambda=1
## + Fold04: alpha=0, lambda=1
## - Fold04: alpha=0, lambda=1
## + Fold04: alpha=1, lambda=1
## - Fold04: alpha=1, lambda=1
## + Fold05: alpha=0, lambda=1
## - Fold05: alpha=0, lambda=1
## + Fold05: alpha=1, lambda=1
## - Fold05: alpha=1, lambda=1
## + Fold06: alpha=0, lambda=1
## - Fold06: alpha=0, lambda=1
## + Fold06: alpha=1, lambda=1
## - Fold06: alpha=1, lambda=1
## + Fold07: alpha=0, lambda=1
## - Fold07: alpha=0, lambda=1
## + Fold07: alpha=1, lambda=1
## - Fold07: alpha=1, lambda=1
## + Fold08: alpha=0, lambda=1
## - Fold08: alpha=0, lambda=1
## + Fold08: alpha=1, lambda=1
## - Fold08: alpha=1, lambda=1
## + Fold09: alpha=0, lambda=1
## - Fold09: alpha=0, lambda=1
## + Fold09: alpha=1, lambda=1
## - Fold09: alpha=1, lambda=1
## + Fold10: alpha=0, lambda=1
## - Fold10: alpha=0, lambda=1
## + Fold10: alpha=1, lambda=1
## - Fold10: alpha=1, lambda=1
## Aggregating results
## Selecting tuning parameters
## Fitting alpha = 1, lambda = 0.0527 on full training set
# Gradient boosting on the five raw predictors only (no engineered
# interaction terms).
# NOTE(review): `metric` and `control` are assumed to be defined in an
# earlier chunk and `control` may differ from myControl — confirm before
# comparing fit.gbm against the glmnet/rf models.
set.seed(10)
fit.gbm <- train(
  type ~ bone_length + rotting_flesh + hair_length + has_soul + color,
  data = train_fe,
  method = "gbm",
  metric = metric,
  trControl = control,
  verbose = FALSE
)
# Summarize cross-validated Accuracy/Kappa for the fitted models.
# NOTE(review): fit.gbm is not included — it was resampled under `control`
# rather than myControl, so its folds need not line up; confirm before
# adding it to this comparison.
model_list <- list(
  glm  = glm_model,
  rf   = rf_model,
  rf2  = rf_model2,
  glm2 = glm_model2,
  glm3 = glm_model3
)
results <- resamples(model_list)
summary(results)
##
## Call:
## summary.resamples(object = results)
##
## Models: glm, rf, rf2, glm2, glm3
## Number of resamples: 10
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glm 0.6842 0.7222 0.7333 0.7438 0.7616 0.8649 0
## rf 0.6667 0.6888 0.7105 0.7195 0.7500 0.7895 0
## rf2 0.6216 0.6888 0.7260 0.7203 0.7518 0.8333 0
## glm2 0.6579 0.7095 0.7468 0.7473 0.7989 0.8286 0
## glm3 0.6316 0.7153 0.7838 0.7636 0.8015 0.8649 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## glm 0.5265 0.5833 0.5988 0.6156 0.6428 0.7965 0
## rf 0.4977 0.5337 0.5664 0.5791 0.6247 0.6837 0
## rf2 0.4301 0.5339 0.5902 0.5806 0.6276 0.7494 0
## glm2 0.4870 0.5640 0.6195 0.6204 0.6975 0.7420 0
## glm3 0.4470 0.5743 0.6740 0.6452 0.7036 0.7967 0
# Dot plot of the resampled metrics (with confidence intervals) per model.
dotplot(results)
# test <- read.csv("test.csv", header = TRUE, sep = ",", stringsAsFactors = FALSE)
#
# ## Make predictions
# ## Reorder the data by creature ID number
# test_complete <- full[full$Dataset == 'test', ]
# test_complete <- test_complete %>%
# arrange(id)
#
# # Make predicted survival values
# my_prediction <- predict(glm_model, test_complete)
# solution <- data.frame(id = test_complete$id, Type = my_prediction)
# write.csv(solution, file = "solution.csv", row.names = FALSE)
#
# # glm_model3
# my_prediction <- predict(glm_model3, test_complete)
# solution <- data.frame(id = test_complete$id, Type = my_prediction)
# write.csv(solution, file = "glm_model3.csv", row.names = FALSE)
#
#
# # # Bayes
# type <- predict(e1071model,test)
# bayes2 <- cbind(test, type)
# #write.csv(bayes2, file = "bayes.csv")
#
# # Knn
# predictions<- predict(knnFit,test)
# results_knn <- cbind(test,predictions)
# #write.csv(results_knn, file = "knn.csv")
#
# # modelGbm
# predictions<- predict(modelGbm,test)
# results_modelGbm <- cbind(test,predictions)
# write.csv(results_modelGbm, file = "gbm.csv")
#
# # Tree
# predictions <- predict(rpartmodel,test)
# train_cv_tree<- cbind(test,predictions)
# #write.csv(train_cv_tree, file = "tree.csv")
#
# ## Combination
# type <- predict(rfmodel,test)
# RF <- cbind(test,type)
# RFprint <- RF %>% select(id, type)
# #write.csv(RFprint, file = "rf.csv")
# stackingmodel
# type <- predict(stackingmodel, test)  # predict() takes the new data frame, not the target column
# RF <- cbind(stackingmodel,type)
# RFprint <- RF %>% select(id, type)
# # # principal component analysis
# # prcomp() is provided by the base 'stats' package,
# # so no library() call is needed (there is no 'prcomp' package)
# prin_comp <- train %>% select(bone_length:has_soul)
# res.pca <- prcomp(prin_comp, scale = TRUE)
#
# # Head
# head(unclass(res.pca$rotation)[, 1:4])
#
# prin_comp <- prcomp(prin_comp, scale. = T)
# print(prin_comp)
# names(res.pca)
#
# ## Variances of the principal components
#
# # The variance retained by each principal component can be obtained as follow :
#
# # Eigenvalues
# eig <- (res.pca$sdev)^2
# # Variances in percentage
# variance <- eig*100/sum(eig)
# # Cumulative variances
# cumvar <- cumsum(variance)
# train.pca <- data.frame(eig = eig, variance = variance,
# cumvariance = cumvar)
# head(train.pca)
#
# # Or extract
# library("factoextra")
# eig.val <- get_eigenvalue(res.pca)
# head(eig.val)
#
# # Variance
# barplot(train.pca[, 2], names.arg=1:nrow(train.pca),
# main = "Variances",
# xlab = "Principal Components",
# ylab = "Percentage of variances",
# col ="steelblue")
#
# # Eigenvalue
# fviz_screeplot(res.pca, ncp=10, choice="eigenvalue")
#
# fviz_pca_ind(res.pca, col.ind="cos2") +
# scale_color_gradient2(low="white", mid="blue",
# high="red", midpoint=0.50) + theme_minimal()
#
#
# fviz_pca_biplot(res.pca, geom = "text") +
# theme_minimal()